#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import division, with_statement
'''
Copyright 2015, 陈同 (chentong_biology@163.com).  
===========================================================
'''
__author__ = 'chentong & ct586[9]'
__author_email__ = 'chentong_biology@163.com'
#=========================================================
# Usage text: cmdparameter() prints it to stderr when the script is started
# without any command-line argument. It also shows the two <all.DE> layouts
# (old "_vs_ ... _up" style and new "_higherThan_/_lowerThan_" style).
desc = '''
Program description:
    This is used to annotate DEseq2.sh output files.

<all.DE> file (old)
ENSG00000242071 fOocyte._vs_.M1_up
ENSG00000228929 fOocyte._vs_.M1_up
ENSG00000213553 fOocyte._vs_.M1_up
ENSG00000224411 fOocyte._vs_.M1_up
ENSG00000283041 fOocyte._vs_.M1_up
ENSG00000168028 fOocyte._vs_.M1_up

<all.DE> file (new)
ENSG00000242071 fOocyte._lowerThan_.M1
ENSG00000228929 fOocyte._lowerThan_.M1
ENSG00000213553 fOocyte._lowerThan_.M1
ENSG00000224411 fOocyte._higherThan_.M1
ENSG00000283041 fOocyte._higherThan_.M1
ENSG00000168028 fOocyte._higherThan_.M1



1. The prefix will be extracted from <all.DE> file by excluding all.DE suffix.

2. The second column will be used to select DE files and annotate differentially expressed genes.

'''

import sys
import os
from json import dumps as json_dumps
from time import localtime, strftime 
# strftime() pattern for the time stamps written to stderr and to python.log.
timeformat = "%Y-%m-%d %H:%M:%S"
from optparse import OptionParser as OP
#from multiprocessing.dummy import Pool as ThreadPool
import pandas as pd

#from bs4 import BeautifulSoup
#reload(sys)
#sys.setdefaultencoding('utf8')

# Module-level debug flag; main() overwrites it with the value of -D/--debug.
debug = 0

def fprint(content):
    """Pretty-print *content* to stdout as JSON (debugging aid).

    Args:
        content: any object that ``json.dumps`` can serialize
            (dict, list, str, number, ...).

    Returns:
        None. The JSON text, indented by one space per level, is written
        to stdout followed by a newline.

    Raises:
        TypeError: propagated from ``json.dumps`` when *content* is not
            JSON serializable.
    """
    # A single parenthesized argument is valid both as a Python 2 print
    # statement and as a Python 3 print() call, with identical output.
    print(json_dumps(content, indent=1))

def cmdparameter(argv):
    """Parse the command line of this script.

    Args:
        argv(list): full argument vector with the program name as first
            element (normally ``sys.argv``).

    Returns:
        tuple: ``(options, args)`` exactly as returned by
        ``OptionParser.parse_args``; ``options`` carries ``filein``,
        ``anno``, ``verbose`` and ``debug``.

    Raises:
        SystemExit: with status 1 when no argument is given, after the
            program description (stderr) and the option help (stdout)
            have been printed.
        AssertionError: when ``-i`` is not supplied.
    """
    usages = "%prog -i file"
    parser = OP(usage=usages)
    parser.add_option("-i", "--input-file", dest="filein",
        metavar="FILEIN", help="<all.DE> file generated by DESeq2.sh")
    parser.add_option("-a", "--anno", dest="anno",
        help="Annotation file. First column will be used as index.")
    parser.add_option("-v", "--verbose", dest="verbose",
        action="store_true", help="Show process information")
    parser.add_option("-D", "--debug", dest="debug",
        default=False, action="store_true", help="Debug the program")

    if len(argv) == 1:
        # No arguments: show the description and the option help, then quit.
        # The help is printed in-process instead of re-running the script
        # through a shell, which broke on paths containing spaces and could
        # start a different interpreter than the one currently running.
        sys.stderr.write(desc + "\n")
        parser.print_help()
        sys.exit(1)

    (options, args) = parser.parse_args(argv[1:])
    # Raised explicitly (not via `assert`) so the check survives `python -O`.
    if options.filein is None:
        raise AssertionError("A filename needed for -i")
    return (options, args)
#--------------------------------------------------------------------


def _annotate_table(source, target, annoM):
    """Left-join the annotation columns onto one table and save the result.

    Args:
        source(str): tab-separated table to annotate; its first column is
            used as the index.
        target(str): path of the annotated, tab-separated output table.
        annoM(pandas.DataFrame): annotation matrix whose index holds the
            same kind of IDs as the first column of *source*.
    """
    mat = pd.read_table(source, header=0, index_col=0)
    rownames = mat.index
    mat = mat.join(annoM, how="left")
    # str(...) yields the native string type: a byte string on Python 2
    # (this file enables unicode_literals) and text on Python 3.
    mat = mat.fillna(str('Unknown'))
    # Re-select the rows by the original index, as the table was read.
    mat = mat.loc[rownames, :]
    mat.index.name = "ID"
    mat.to_csv(target, sep=str("\t"))


def _annotate_old_format(de_file, prefix, annoM):
    """Annotate result files named by an old-style (``A._vs_.B_up``) list."""
    with open(de_file) as fh:
        # Blank lines carry no group label and are skipped.
        de_grp = set(line.split()[1] for line in fh if line.strip())
    for label in de_grp:
        de_grp_name, de_grp_type = label.rsplit('_', 1)
        results = prefix + de_grp_name + '.results'
        _annotate_table(results, results + '.anno.xls', annoM)
        de_results = results + '.DE_' + de_grp_type
        _annotate_table(de_results, de_results + '.anno.xls', annoM)


def _annotate_new_format(de_file, prefix, annoM):
    """Annotate result files named by a new-style (``A._higherThan_.B``) list."""
    de_grpL = []
    seen = set()
    with open(de_file) as fh:
        for line in fh:
            if not line.strip():
                # Blank lines carry no group label.
                continue
            name = line.split()[1].replace('._higherThan_.', '____')
            name = name.replace('._lowerThan_.', '____')
            nameL = name.split('____')
            key = tuple(nameL)
            # Keep each comparison once, in order of first appearance.
            if key not in seen:
                seen.add(key)
                de_grpL.append(nameL)
    for de_grp_name in de_grpL:
        results = prefix + '._vs_.'.join(de_grp_name) + '.results.xls'
        _annotate_table(results, results + '.anno.xls', annoM)
        for direction in ('._higherThan_.', '._lowerThan_.'):
            ht = prefix + direction.join(de_grp_name) + '.xls'
            if os.path.exists(ht):
                # Annotate the direction-specific subset itself. The
                # previous code re-read the full results table here, so
                # every subset output was a copy of the full table.
                _annotate_table(ht, ht + '.anno.xls', annoM)


def main():
    """Annotate the tables that belong to one <all.DE> file.

    Reads the command line through ``cmdparameter(sys.argv)``, loads the
    annotation file given with ``-a`` and writes an ``*.anno.xls`` copy of
    the normalized table, the rlog-normalized table and every result table
    referenced by the second column of the <all.DE> file. Missing
    annotation values are filled with ``Unknown``.

    Raises:
        ValueError: when no annotation file was given with ``-a``.
    """
    global debug
    options, args = cmdparameter(sys.argv)
    #-----------------------------------
    de_file = options.filein
    anno = options.anno
    if anno is None:
        # Fail with a clear message instead of an obscure pandas error.
        raise ValueError("An annotation file is needed for -a")
    annoM = pd.read_table(anno, header=0, index_col=0)
    verbose = options.verbose
    debug = options.debug
    #-----------------------------------
    suffix = "all.DE"
    if de_file.endswith(suffix):
        # Strip only the trailing suffix, so an "all.DE" occurring earlier
        # in the path (e.g. in a directory name) is left untouched.
        prefix = de_file[:-len(suffix)]
    else:
        # Unusual file name: keep the historical behaviour.
        prefix = de_file.replace(suffix, "")

    for stem in ('normalized', 'normalized.rlog'):
        _annotate_table(prefix + stem + '.xls',
                        prefix + stem + '.anno.xls', annoM)

    # Layout of the <all.DE> file; currently fixed to the new naming scheme.
    de_format = 'new'
    if de_format == 'old':
        _annotate_old_format(de_file, prefix, annoM)
    elif de_format == 'new':
        _annotate_new_format(de_file, prefix, annoM)

    if verbose:
        sys.stderr.write(
            "--Successful %s\n" % strftime(timeformat, localtime()))

if __name__ == '__main__':
    startTime = strftime(timeformat, localtime())
    main()
    endTime = strftime(timeformat, localtime())
    # Append one record per successful run: the command line plus the start
    # and end time. `with` closes the log even if the write itself fails.
    with open('python.log', 'a') as fh:
        fh.write("%s\n\tRun time : %s - %s \n" %
                 (' '.join(sys.argv), startTime, endTime))
    ###---------profile the program---------
    #import profile
    #profile_output = sys.argv[0]+".prof.txt")
    #profile.run("main()", profile_output)
    #import pstats
    #p = pstats.Stats(profile_output)
    #p.sort_stats("time").print_stats()
    ###---------profile the program---------


